import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from datetime import datetime, timedelta
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn import mixture
from sklearn.neighbors import KernelDensity
import plotly.io as pio
import warnings
# Silence all library warnings and render plotly figures with the notebook
# renderer.
warnings.filterwarnings('ignore')
pio.renderers.default = "notebook"

# Load the raw transactions (the CSV is ISO-8859-1 encoded).
data = pd.read_csv('data.csv', encoding="ISO-8859-1")
data.head()

# Remove inconsistent data: rows whose quantity or unit price is not
# strictly positive are dropped by index label, one criterion at a time.
data = data.drop(index=data.index[data['Quantity'] <= 0])
data = data.drop(index=data.index[data['UnitPrice'] <= 0])

# Monetary value of every invoice line.
data['Total'] = data['Quantity'] * data['UnitPrice']

# Parse the invoice timestamps so the .dt accessors below are available.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])

# Calendar day of each line, and the distinct days present in the data.
invoice_day = data['InvoiceDate'].dt.date
unique_dates = invoice_day.unique()
data['Date'] = invoice_day

# Monthly period of each line; used as the grouping key further down.
data['Month'] = data['InvoiceDate'].dt.to_period('M')
data.head()
# Total sales value per country, displayed as a bar chart.
counts_countries = data.groupby('Country')['Total'].sum()

country_bars = go.Bar(
    x=counts_countries.index.tolist(),
    y=counts_countries.tolist(),
)
fig = go.Figure(data=[country_bars])
fig.update_layout(barmode='group')
fig.show()
# Number of distinct invoices on which each product (StockCode) appears.
counts_products = data.groupby('StockCode')['InvoiceNo'].nunique()
list_products = np.array(counts_products.index.tolist())
list_counts = np.array(counts_products.tolist())

# Row order that lists products by ascending invoice count.
sort_index = np.argsort(list_counts)

# Table of products and their invoice counts.
product_table = go.Table(
    header={'values': ['Product', 'Number of Invoices']},
    cells={'values': [list_products[sort_index], list_counts[sort_index]]},
)
fig = go.Figure(data=[product_table])
fig.show()
# Number of non-null InvoiceNo entries (i.e. data rows) per customer.
# NOTE(review): count() tallies rows, not distinct invoices, although the
# table header below reads "Number of Invoices" -- confirm which is intended
# (the product table uses nunique()).
counts_clients = data.groupby('CustomerID')['InvoiceNo'].count()
list_clients = np.array(counts_clients.index.tolist())
list_counts = np.array(counts_clients.tolist())

# Row order that lists customers by ascending count.
sort_index = np.argsort(list_counts)

# Table of customers and their counts.
client_table = go.Table(
    header={'values': ['Client ID', 'Number of Invoices']},
    cells={'values': [list_clients[sort_index], list_counts[sort_index]]},
)
fig = go.Figure(data=[client_table])
fig.show()
# Total sales value per month.
total = data.groupby('Month')['Total'].sum()

# First day of each month, taken from the grouped index itself so that the
# x values always line up one-to-one with `total`. groupby() sorts its keys,
# whereas unique() on the raw column follows the row order of `data`; the two
# only agree when the file happens to be in chronological order.
unique_dates = total.index.to_timestamp().date

# Scatter data
data_fig = go.Scatter(
    x=unique_dates,
    y=total,
    name="Sales",
    line=dict(color='deepskyblue'),
)

# Layout: centred bold title and bold axis titles in a shared font.
axis_title_font = dict(family='Courier New, monospace', size=18, color='#000000')
layout = go.Layout(
    title=dict(text="<b>Total Sales</b>", xref='paper', x=0.5, y=0.9),
    xaxis=dict(title=dict(text='<b>Time</b>', font=axis_title_font)),
    yaxis=dict(title=dict(text='<b>Sales</b>', font=axis_title_font)),
)

# Make figure
fig = go.Figure(data=data_fig, layout=layout)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()
# Forecast settings: horizon in days and half-width of the plotted band in
# standard deviations.
num_days_predict = 360
num_sigma = 2
k_fold = 10  # NOTE: not used below; the cross-validation is leave-one-out.

# Express every training date as a day offset from the first one.
origin = unique_dates[0].toordinal()

# Training data: one feature (day offset) per month, monthly totals as target.
X = np.array([day.toordinal() - origin for day in unique_dates]).reshape(-1, 1)
y = total

# Test data: `num_days_predict` consecutive days starting at the last
# training day.
first_test_day = int(X[-1, 0])
X_test = np.arange(first_test_day, first_test_day + num_days_predict).reshape(-1, 1)
X_test_dates = np.array([datetime.fromordinal(int(offset) + origin)
                         for offset in X_test[:, 0]])

# Model selection. The parameter grid is empty, so this only cross-validates
# and refits the single candidate. Leave-one-out produces one-sample test
# folds, on which the estimator's default R^2 score is undefined; an explicit
# per-sample error metric keeps the CV scores meaningful.
grid = GridSearchCV(
    BayesianRidge(tol=0.0001, fit_intercept=True, compute_score=True),
    {},
    scoring='neg_mean_squared_error',
    cv=LeaveOneOut(),
    n_jobs=-1,
)

# Train and predict (mean and standard deviation) on both the training
# dates and the forecast horizon.
grid.fit(X, y)
reg = grid.best_estimator_
y_mean_train, y_stdv_train = reg.predict(X, return_std=True)
y_mean, y_stdv = reg.predict(X_test, return_std=True)

# Scatter data
source = go.Scatter(
    x=unique_dates,
    y=total,
    name="Sales",
    mode='lines+markers',
    line=dict(color='deepskyblue', dash='solid'),
)
predicted_low = go.Scatter(
    x=unique_dates,
    y=y_mean_train - num_sigma * y_stdv_train,
    name="Predicted Lower",
    mode='lines',
    line=dict(color='red', dash='dash'),
)
# 'tonexty' fills towards the trace listed just before this one in
# `data_fig`, i.e. the lower bound.
predicted_up = go.Scatter(
    x=unique_dates,
    y=y_mean_train + num_sigma * y_stdv_train,
    name="Predicted Upper",
    mode='lines',
    line=dict(color='red', dash='dash'),
    fill='tonexty',
    fillcolor='rgba(255, 0, 0, 0.1)',
)
predicted_mean = go.Scatter(
    x=unique_dates,
    y=y_mean_train,
    name="Predicted Mean",
    mode='lines',
    line=dict(color='red', dash='solid'),
)
forecast_up = go.Scatter(
    x=X_test_dates,
    y=y_mean + num_sigma * y_stdv,
    name="Forecast Upper",
    line=dict(color='green', dash='dash'),
)
# Listed right after the upper bound so that 'tonexty' shades the band.
forecast_low = go.Scatter(
    x=X_test_dates,
    y=y_mean - num_sigma * y_stdv,
    name="Forecast Lower",
    line=dict(color='green', dash='dash'),
    fill='tonexty',
    fillcolor='rgba(0, 255, 0, 0.1)',
)
forecast_mean = go.Scatter(
    x=X_test_dates,
    y=y_mean,
    name="Forecast Mean",
    line=dict(color='green', dash='solid'),
)
# Trace order matters for the two filled bands above.
data_fig = [source, predicted_low, predicted_up, forecast_up, forecast_low,
            predicted_mean, forecast_mean]

# Layout: centred bold title and bold axis titles in a shared font.
axis_title_font = dict(family='Courier New, monospace', size=18, color='#000000')
layout = go.Layout(
    title=dict(text="<b>Total Sales</b>", xref='paper', x=0.5, y=0.9),
    xaxis=dict(title=dict(text='<b>Time</b>', font=axis_title_font)),
    yaxis=dict(title=dict(text='<b>Sales</b>', font=axis_title_font)),
)

# Make figure
fig = go.Figure(data=data_fig, layout=layout)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()
# Product whose monthly demand is analysed.
product_id = '85123A'

# Quantity of this product sold per month.
product = data.loc[data['StockCode'] == product_id]
total_product = product.groupby('Month')['Quantity'].sum()

# First day of each month in which the product was actually sold. The dates
# come from the grouped index so they always match `total_product` in length
# and order; taking them from the whole of `data` yields extra months
# whenever the product has no sales in one of them.
unique_dates = total_product.index.to_timestamp().date

# Scatter data
data_fig = go.Scatter(
    x=unique_dates,
    y=total_product,
    name="Demand {}".format(product_id),
    line=dict(color='deepskyblue'),
)

# Layout: centred bold title and bold axis titles in a shared font.
axis_title_font = dict(family='Courier New, monospace', size=18, color='#000000')
layout = go.Layout(
    title=dict(text="<b>Demand {}</b>".format(product_id), xref='paper', x=0.5, y=0.9),
    xaxis=dict(title=dict(text='<b>Time</b>', font=axis_title_font)),
    yaxis=dict(title=dict(text='<b>Demand</b>', font=axis_title_font)),
)

# Make figure
fig = go.Figure(data=data_fig, layout=layout)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()
# Half-width of the plotted band (in standard deviations) and forecast
# horizon (in days).
num_sigma = 2
num_days_predict = 360

# Training data: day offsets from the first month versus monthly demand.
origin = unique_dates[0].toordinal()
X = np.array([day.toordinal() - origin for day in unique_dates]).reshape(-1, 1)
y = total_product

# Test data: `num_days_predict` consecutive days starting at the last
# training day.
first_test_day = int(X[-1, 0])
X_test = np.arange(first_test_day, first_test_day + num_days_predict).reshape(-1, 1)
X_test_dates = np.array([datetime.fromordinal(int(offset) + origin)
                         for offset in X_test[:, 0]])

# Fit the Bayesian ridge regressor and predict mean and standard deviation
# on both the training dates and the forecast horizon.
reg = BayesianRidge(tol=1e-6, fit_intercept=True, compute_score=True)
reg.fit(X, y)
y_mean_train, y_stdv_train = reg.predict(X, return_std=True)
y_mean, y_stdv = reg.predict(X_test, return_std=True)

# Scatter data
source = go.Scatter(
    x=unique_dates,
    y=total_product,
    name="Sales",
    mode='lines+markers',
    line=dict(color='deepskyblue', dash='solid'),
)
predicted_low = go.Scatter(
    x=unique_dates,
    y=y_mean_train - num_sigma * y_stdv_train,
    name="Predicted Lower",
    mode='lines',
    line=dict(color='red', dash='dash'),
)
# 'tonexty' fills towards the trace listed just before this one in
# `data_fig`, i.e. the lower bound.
predicted_up = go.Scatter(
    x=unique_dates,
    y=y_mean_train + num_sigma * y_stdv_train,
    name="Predicted Upper",
    mode='lines',
    line=dict(color='red', dash='dash'),
    fill='tonexty',
    fillcolor='rgba(255, 0, 0, 0.1)',
)
predicted_mean = go.Scatter(
    x=unique_dates,
    y=y_mean_train,
    name="Predicted Mean",
    mode='lines',
    line=dict(color='red', dash='solid'),
)
forecast_up = go.Scatter(
    x=X_test_dates,
    y=y_mean + num_sigma * y_stdv,
    name="Forecast Upper",
    line=dict(color='green', dash='dash'),
)
# Listed right after the upper bound so that 'tonexty' shades the band.
forecast_low = go.Scatter(
    x=X_test_dates,
    y=y_mean - num_sigma * y_stdv,
    name="Forecast Lower",
    line=dict(color='green', dash='dash'),
    fill='tonexty',
    fillcolor='rgba(0, 255, 0, 0.1)',
)
forecast_mean = go.Scatter(
    x=X_test_dates,
    y=y_mean,
    name="Forecast Mean",
    line=dict(color='green', dash='solid'),
)
# Trace order matters for the two filled bands above.
data_fig = [source, predicted_low, predicted_up, forecast_up, forecast_low,
            predicted_mean, forecast_mean]

# Layout: centred bold title and bold axis titles in a shared font.
axis_title_font = dict(family='Courier New, monospace', size=18, color='#000000')
layout = go.Layout(
    title=dict(text="<b>Demand for Most Demanded Product</b>", xref='paper', x=0.5, y=0.9),
    xaxis=dict(title=dict(text='<b>Time</b>', font=axis_title_font)),
    yaxis=dict(title=dict(text='<b>Demand</b>', font=axis_title_font)),
)

# Make figure
fig = go.Figure(data=data_fig, layout=layout)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()
# Top three customers (hard-coded customer IDs).
customer_ids = [17841, 14911, 14096]

# Per-customer results, keyed by customer ID:
#   unique_dates -> distinct invoice days, in the order they occur in `data`
#   dt_dict      -> array of day gaps between consecutive invoice days
# `data_fig` collects one histogram trace per customer.
data_fig = []
dt_dict = {}
unique_dates = {}
for position, customer_id in enumerate(customer_ids, start=1):
    # Get customer dates
    customer = data.loc[data['CustomerID'] == customer_id]
    purchase_days = customer['InvoiceDate'].dt.date.unique()
    unique_dates[customer_id] = purchase_days

    # Day differences between each pair of consecutive dates.
    dt_days = [(later - earlier).days
               for earlier, later in zip(purchase_days[:-1], purchase_days[1:])]
    dt_dict[customer_id] = np.array(dt_days)

    # Histogram of the gaps, labelled by the customer's 1-based position.
    data_fig.append(go.Histogram(x=dt_days,
                                 name='Customer {}'.format(position),
                                 opacity=0.5))

# Figure title. It is kept in `layout` as before and also applied to the
# subplot figure below, which previously never received it.
histogram_title = dict(text="<b>Histogram of days between calls</b>",
                       xref='paper',
                       x=0.5,
                       y=0.9)
layout = go.Layout(title=histogram_title)

# Sub titles
subplot_titles = tuple('<b>Customer ' + str(number) + '</b>'
                       for number in range(1, len(customer_ids) + 1))

# One subplot row per customer.
fig = make_subplots(rows=len(customer_ids),
                    cols=1,
                    subplot_titles=subplot_titles)
for row, histogram in enumerate(data_fig, start=1):
    # add_trace(row=, col=) replaces the deprecated append_trace().
    fig.add_trace(histogram, row=row, col=1)
    fig.update_xaxes(title_text="<b>Days between Calls</b>", row=row, col=1)
    fig.update_yaxes(title_text="<b>Frequency</b>", row=row, col=1)
fig.update_layout(title=histogram_title, height=650)
fig.show()
# Number of evaluation points for each estimated density.
n_samples = 1000

# Hyper-parameters explored by the grid search.
k_folds = 10
bandwidths = 10 ** np.linspace(-1, 1, 100)
kernels = ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']
metrics = ['euclidean', 'minkowski']

# Model selection for all customers: fit a kernel density estimator to each
# customer's day gaps, choosing bandwidth/kernel/metric by cross-validation.
best_kdes = {}
xs = {}
for k, deltas in dt_dict.items():
    # Evaluation grid from 0 to the customer's largest gap, as a column vector.
    xs[k] = np.linspace(0, max(deltas), n_samples).reshape(-1, 1)

    grid = GridSearchCV(KernelDensity(),
                        {'bandwidth': bandwidths, 'kernel': kernels, 'metric': metrics},
                        cv=k_folds,
                        n_jobs=-1)
    grid.fit(deltas.reshape(-1, 1))
    best_kdes[k] = grid.best_estimator_

# Get PDF for all customers
pdfs = {}
for k, kde in best_kdes.items():
    density = np.exp(kde.score_samples(xs[k]))
    # Keep only density values <= 1, and drop the matching grid points as
    # well so that xs[k] and pdfs[k] stay the same length when plotted.
    keep = density <= 1
    pdfs[k] = density[keep]
    xs[k] = xs[k][keep]

# Sub titles
subplot_titles = tuple('<b>Customer ' + str(number) + '</b>'
                       for number in range(1, len(customer_ids) + 1))

# One subplot row per customer: gap histogram with the fitted density on top.
fig = make_subplots(rows=len(customer_ids),
                    cols=1,
                    subplot_titles=subplot_titles)
for row, k in enumerate(dt_dict, start=1):
    fig.add_trace(go.Histogram(x=dt_dict[k],
                               histnorm='probability',
                               name='<b>Mixture Customer {}</b>'.format(row)),
                  row=row,
                  col=1)
    fig.add_trace(go.Scatter(x=xs[k][:, 0],
                             y=pdfs[k],
                             name='<b>Mixture Customer {}</b>'.format(row)),
                  row=row,
                  col=1)
fig.update_layout(height=500)
fig.show()
# Half-width (in days) of the window whose probability is computed, and the
# number of points used to integrate the density over it.
range_days = 2
n_samples = 1000

# Initialize
num_days = {}
dt_possible = {}
dt_days = {}
possible_date_time = {}

# Get interval of days: for each customer pick a random candidate date 4-9
# days after the last entry of unique_dates[k], and express it as a day gap.
for k, purchase_days in unique_dates.items():
    last_purchase = purchase_days[-1]
    num_days[k] = np.random.randint(4, 10)
    possible_date_time[k] = last_purchase + timedelta(num_days[k])
    dt_possible[k] = possible_date_time[k] - last_purchase
    dt_days[k] = dt_possible[k].days

# Initialize
pdf_samples = {}
samples = {}
proba = {}
lim_sup = {}
lim_inf = {}
min_lim = {}
max_lim = {}

# np.trapz is deprecated in NumPy 2.0 in favour of np.trapezoid; use
# whichever this NumPy version provides.
trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz

# Compute probabilities for each customer
for k, centre in dt_days.items():
    min_lim[k] = centre - range_days
    max_lim[k] = centre + range_days
    window = np.linspace(min_lim[k], max_lim[k], n_samples)
    density = np.exp(best_kdes[k].score_samples(window.reshape(-1, 1)))

    # Remove inconsistencies: keep only points whose density is <= 1.
    indices = density <= 1
    samples[k] = window[indices]
    pdf_samples[k] = density[indices]

    # Probability = area under the density over the window.
    proba[k] = trapezoid(pdf_samples[k], samples[k])

    # Calendar dates bounding the window.
    lim_inf[k] = possible_date_time[k] - timedelta(range_days)
    lim_sup[k] = possible_date_time[k] + timedelta(range_days)

# Sub titles
subplot_titles = tuple('<b>Customer ' + str(number) + '</b>'
                       for number in range(1, len(customer_ids) + 1))

# One subplot row per customer: full density, shaded window, and a caption
# with the window's probability.
fig = make_subplots(rows=len(customer_ids),
                    cols=1,
                    subplot_titles=subplot_titles)
for row, k in enumerate(dt_dict, start=1):
    fig.add_trace(go.Scatter(x=xs[k][:, 0],
                             y=pdfs[k],
                             name='<b>Mixture Customer {}</b>'.format(row)),
                  row=row,
                  col=1)
    # Customers are numbered from 1 here, matching every other label.
    fig.add_scatter(x=samples[k],
                    y=pdf_samples[k],
                    fill='tozeroy',
                    name='<b>Predicted Customer {}</b>'.format(row),
                    row=row,
                    col=1)
    fig.add_annotation(x=min_lim[k],
                       y=1.1 * pdfs[k].max(),
                       text="<b>Probability of call between {0} and {1} = {2:1.2f}</b>".format(lim_inf[k],
                                                                                             lim_sup[k],
                                                                                             proba[k]),
                       showarrow=False,
                       row=row,
                       col=1)
fig.update_layout(height=550)
fig.show()